######################################################
# CleanSurveyData.R
# This file does some basic cleaning and validation checks on the survey files to prepare for analysis.
# This also merges in the treatment variables and creates some descriptive variables
#
# note this file requires the working directory to be set to "./Merge and Clean Data/"
# Contact Ryan Jablonski, r.s.jablonski@lse.ac.uk with questions
#
# Log
# Created 2018
# Edited for APSR replication 17 August 2023 by Ryan Jablonski
######################################################

rm(list=ls(all=TRUE))

library(readstata13)

c.survey=read.dta13("../Original Survey Files/AID_MALAWI_COUNCILOR_withids.dta")
mp.survey=read.dta13("../Original Survey Files/AID_MALAWI_MP_withids.dta")


#school-specific information used for the treatments
mpschools=read.csv("./input/school_treatment_information_mp.csv", stringsAsFactors = FALSE)
lcschools=read.csv("./input/school_treatment_information_lc.csv", stringsAsFactors = FALSE)

#treatment assignment
treatments_lc=read.csv("./input/2017-02-22local-randomized.csv", stringsAsFactors = FALSE)
treatments_mp=read.csv("./input/2017-02-22-randomized_const.csv", stringsAsFactors = FALSE)

#details on the identify of schools where respendents have family attending (hand coded based on aid_53 and aid_54)
lc_childrenschools=read.csv("./input/lc_children_school_details.csv", stringsAsFactors = FALSE)
mp_childrenschools=read.csv("./input/mp_children_school_details.csv", stringsAsFactors = FALSE)

#remove factors
c.survey <- data.frame(lapply(c.survey, as.character), stringsAsFactors=FALSE)
mp.survey <- data.frame(lapply(mp.survey, as.character), stringsAsFactors=FALSE)


c.survey$current_party=ifelse(grepl("mcp",c.survey$aid_71, ignore.case = T), "mcp", NA)
c.survey$current_party=ifelse(grepl("congress",c.survey$aid_71, ignore.case = T), "mcp", c.survey$current_party)
c.survey$current_party=ifelse(grepl("independent",c.survey$aid_71, ignore.case = T), "independent", c.survey$current_party)
c.survey$current_party=ifelse(grepl("UDF",c.survey$aid_71, ignore.case = T), "udf", c.survey$current_party)

c.survey$current_party=ifelse(grepl("PP",c.survey$aid_71, ignore.case = T), "pp", c.survey$current_party)
c.survey$current_party=ifelse(grepl("people",c.survey$aid_71, ignore.case = T), "pp", c.survey$current_party)

c.survey$current_party=ifelse(grepl("dpp",c.survey$aid_71, ignore.case = T), "dpp", c.survey$current_party)
c.survey$current_party=ifelse((c.survey$aid_71<0), "no answer", c.survey$current_party)
c.survey$current_party=ifelse(grepl("decided",c.survey$aid_71, ignore.case = T), "no answer", c.survey$current_party)
c.survey$current_party=ifelse(grepl("classroom",c.survey$aid_71, ignore.case = T), "no answer", c.survey$current_party)

c.survey$gender=c.survey$aid_3

c.survey$education_plan=c.survey$aid_42

c.survey$tribe=ifelse(c.survey$aid_13_opt1=="Yes", "Chewa", NA)
c.survey$tribe=ifelse(c.survey$aid_13_opt1=="Yes", "Lomwe", c.survey$tribe)
c.survey$tribe=ifelse(c.survey$aid_13_opt1=="Yes", "Ngoni", c.survey$tribe)
c.survey$tribe=ifelse(c.survey$aid_13_opt1=="Yes", "Yao", c.survey$tribe)
c.survey$tribe=ifelse(c.survey$aid_13_opt1=="Yes", "Tumbuka", c.survey$tribe)
c.survey$tribe=ifelse(c.survey$aid_13_opt1=="Yes", "Sena", c.survey$tribe)
c.survey$tribe=ifelse(c.survey$aid_13_opt1=="Yes", "Other", c.survey$tribe)


c.survey$tribe_of_constituency=ifelse(c.survey$aid_14_opt1=="Yes", "Chewa", NA)
c.survey$tribe_of_constituency=ifelse(c.survey$aid_14_opt1=="Yes", "Lomwe", c.survey$tribe_of_constituency)
c.survey$tribe_of_constituency=ifelse(c.survey$aid_14_opt1=="Yes", "Ngoni", c.survey$tribe_of_constituency)
c.survey$tribe_of_constituency=ifelse(c.survey$aid_14_opt1=="Yes", "Yao", c.survey$tribe_of_constituency)
c.survey$tribe_of_constituency=ifelse(c.survey$aid_14_opt1=="Yes", "Tumbuka", c.survey$tribe_of_constituency)
c.survey$tribe_of_constituency=ifelse(c.survey$aid_14_opt1=="Yes", "Sena", c.survey$tribe_of_constituency)
c.survey$tribe_of_constituency=ifelse(c.survey$aid_14_opt1=="Yes", "Other", c.survey$tribe_of_constituency)


c.survey$education=c.survey$aid_18

c.survey$income=c.survey$aid_20

c.survey$lenght_of_residence=c.survey$aid_8

mp.survey$children_attend_school=ifelse(nchar(mp.survey$aid_53_1)>0, 1, 0)
mp.survey$family_children_attend_school=ifelse(nchar(mp.survey$aid_54_1)>0, 1, 0)
mp.survey$current_party=ifelse(grepl("mcp",mp.survey$aid_71, ignore.case = T), "mcp", NA)
mp.survey$current_party=ifelse(grepl("congress",mp.survey$aid_71, ignore.case = T), "mcp", mp.survey$current_party)
mp.survey$current_party=ifelse(grepl("independent",mp.survey$aid_71, ignore.case = T), "independent", mp.survey$current_party)
mp.survey$current_party=ifelse(grepl("UDF",mp.survey$aid_71, ignore.case = T), "udf", mp.survey$current_party)

mp.survey$current_party=ifelse(grepl("PP",mp.survey$aid_71, ignore.case = T), "pp", mp.survey$current_party)
mp.survey$current_party=ifelse(grepl("people",mp.survey$aid_71, ignore.case = T), "pp", mp.survey$current_party)

mp.survey$current_party=ifelse(grepl("dpp",mp.survey$aid_71, ignore.case = T), "dpp", mp.survey$current_party)
mp.survey$current_party=ifelse(grepl("FORD",mp.survey$aid_71, ignore.case = T), "aford", mp.survey$current_party)

mp.survey$current_party=ifelse((mp.survey$aid_71<0), "no answer", mp.survey$current_party)
mp.survey$current_party=ifelse(grepl("decided",mp.survey$aid_71, ignore.case = T), "no answer", mp.survey$current_party)
mp.survey$current_party=ifelse(grepl("comment",mp.survey$aid_71, ignore.case = T), "no answer", mp.survey$current_party)

mp.survey$gender=mp.survey$aid_3

mp.survey$education_plan=mp.survey$aid_42


mp.survey$tribe=ifelse(mp.survey$aid_13_opt1=="Yes", "Chewa", NA)
mp.survey$tribe=ifelse(mp.survey$aid_13_opt1=="Yes", "Lomwe", mp.survey$tribe)
mp.survey$tribe=ifelse(mp.survey$aid_13_opt1=="Yes", "Ngoni", mp.survey$tribe)
mp.survey$tribe=ifelse(mp.survey$aid_13_opt1=="Yes", "Yao", mp.survey$tribe)
mp.survey$tribe=ifelse(mp.survey$aid_13_opt1=="Yes", "Tumbuka", mp.survey$tribe)
mp.survey$tribe=ifelse(mp.survey$aid_13_opt1=="Yes", "Sena", mp.survey$tribe)
mp.survey$tribe=ifelse(mp.survey$aid_13_opt1=="Yes", "Other", mp.survey$tribe)


mp.survey$tribe_of_constituency=ifelse(mp.survey$aid_14_opt1=="Yes", "Chewa", NA)
mp.survey$tribe_of_constituency=ifelse(mp.survey$aid_14_opt2=="Yes", "Lomwe", mp.survey$tribe_of_constituency)
mp.survey$tribe_of_constituency=ifelse(mp.survey$aid_14_opt3=="Yes", "Ngoni", mp.survey$tribe_of_constituency)
mp.survey$tribe_of_constituency=ifelse(mp.survey$aid_14_opt4=="Yes", "Yao", mp.survey$tribe_of_constituency)
mp.survey$tribe_of_constituency=ifelse(mp.survey$aid_14_opt5=="Yes", "Tumbuka", mp.survey$tribe_of_constituency)
mp.survey$tribe_of_constituency=ifelse(mp.survey$aid_14_opt6=="Yes", "Sena", mp.survey$tribe_of_constituency)
mp.survey$tribe_of_constituency=ifelse(mp.survey$aid_14_opt7=="Yes", "Other", mp.survey$tribe_of_constituency)


mp.survey$education=mp.survey$aid_18
mp.survey$income=mp.survey$aid_20

mp.survey$lenght_of_residence=mp.survey$aid_8

mp.survey$ps_ward_id=NULL

mp.survey.byschool=merge(mpschools, mp.survey, by="constituencyid", all.x=T)
c.survey.byschool=merge(lcschools, c.survey, by="ps_ward_id", all.x=T)
#some cleanup
mp.survey.byschool$ps_ward=mp.survey.byschool$ps_ward.x
mp.survey.byschool$ps_ward.x=NULL
mp.survey.byschool$ps_ward.y=NULL
mp.survey.byschool$ps_constituency=mp.survey.byschool$ps_constituency.x
mp.survey.byschool$ps_constituency.x=NULL
mp.survey.byschool$ps_constituency.y=NULL
mp.survey.byschool$ps_district=mp.survey.byschool$ps_district.x
mp.survey.byschool$ps_district.x=NULL
mp.survey.byschool$ps_district.y=NULL
mp.survey.byschool$map_district=mp.survey.byschool$map_district.x
mp.survey.byschool$map_district.x=NULL
mp.survey.byschool$map_district.y=NULL

c.survey.byschool$ps_ward=c.survey.byschool$ps_ward.x
c.survey.byschool$ps_ward.x=NULL
c.survey.byschool$ps_ward.y=NULL
c.survey.byschool$ps_constituency=c.survey.byschool$ps_constituency.x
c.survey.byschool$ps_constituency.x=NULL
c.survey.byschool$ps_constituency.y=NULL
c.survey.byschool$ps_district=c.survey.byschool$ps_district.x
c.survey.byschool$ps_district.x=NULL
c.survey.byschool$ps_district.y=NULL
c.survey.byschool$map_district=c.survey.byschool$map_district.x
c.survey.byschool$map_district.x=NULL
c.survey.byschool$map_district.y=NULL

c.survey.byschool$constituencyid=c.survey.byschool$constituencyid.x
c.survey.byschool$constituencyid.x=NULL
c.survey.byschool$constituencyid.y=NULL

mp.survey.byschool$treatment=treatments_mp[match(mp.survey.byschool$school_id, treatments_mp$school_id),"treatment"]
mp.survey.byschool$transparencyblock=treatments_mp[match(mp.survey.byschool$school_id, treatments_mp$school_id),"transparencyblock"]
mp.survey.byschool$transparency_condition=treatments_mp[match(mp.survey.byschool$school_id, treatments_mp$school_id),"Transparency_Condition"]
mp.survey.byschool$map_order=treatments_mp[match(mp.survey.byschool$school_id, treatments_mp$school_id),"Map_Order"]
mp.survey.byschool$good_order=treatments_mp[match(mp.survey.byschool$school_id, treatments_mp$school_id),"Good_Order"]
#mp.survey.byschool$map_order_id=treatments_mp[match(mp.survey.byschool$school_id, treatments_mp$school_id),"map_order_id"]



c.survey.byschool$treatment=treatments_lc[match(c.survey.byschool$school_id, treatments_lc$school_id),"treatment"]
c.survey.byschool$transparencyblock=treatments_lc[match(c.survey.byschool$school_id, treatments_lc$school_id),"transparencyblock"]
c.survey.byschool$transparency_condition=treatments_lc[match(c.survey.byschool$school_id, treatments_lc$school_id),"Transparency_Condition"]
c.survey.byschool$map_order=treatments_lc[match(c.survey.byschool$school_id, treatments_lc$school_id),"Map_Order"]
#c.survey.byschool$map_order_id=treatments_lc[match(c.survey.byschool$school_id, treatments_lc$school_id),"map_order_id"]
c.survey.byschool$good_order=treatments_lc[match(c.survey.byschool$school_id, treatments_lc$school_id),"Good_Order"]





mp.survey.byschool$project_types=paste(mp.survey.byschool$project_type1, mp.survey.byschool$project_type2,
                                       mp.survey.byschool$project_type3, mp.survey.byschool$project_type4,
                                       mp.survey.byschool$project_type5,
                                       sep=",")
mp.survey.byschool$project_capacity_building=ifelse(grepl("Capacity", mp.survey.byschool$project_types), 1, 0)
mp.survey.byschool$project_community_support=ifelse(grepl("Community", mp.survey.byschool$project_types), 1, 0)
mp.survey.byschool$project_health_services=ifelse(grepl("Health", mp.survey.byschool$project_types), 1, 0)
mp.survey.byschool$project_teacher_training=ifelse(grepl("Teacher", mp.survey.byschool$project_types), 1, 0)
mp.survey.byschool$project_gender_issues=ifelse(grepl("Gender", mp.survey.byschool$project_types), 1, 0)
mp.survey.byschool$project_food_provision=ifelse(grepl("Food", mp.survey.byschool$project_types), 1, 0)
mp.survey.byschool$project_construction=ifelse(grepl("Construction", mp.survey.byschool$project_types), 1, 0)


c.survey.byschool$project_types=paste(c.survey.byschool$project_type1, c.survey.byschool$project_type2,
                                      c.survey.byschool$project_type3, c.survey.byschool$project_type4,
                                      c.survey.byschool$project_type5,
                                      sep=",")
c.survey.byschool$project_capacity_building=ifelse(grepl("Capacity", c.survey.byschool$project_types), 1, 0)
c.survey.byschool$project_community_support=ifelse(grepl("Community", c.survey.byschool$project_types), 1, 0)
c.survey.byschool$project_health_services=ifelse(grepl("Health", c.survey.byschool$project_types), 1, 0)
c.survey.byschool$project_teacher_training=ifelse(grepl("Teacher", c.survey.byschool$project_types), 1, 0)
c.survey.byschool$project_gender_issues=ifelse(grepl("Gender", c.survey.byschool$project_types), 1, 0)
c.survey.byschool$project_food_provision=ifelse(grepl("Food", c.survey.byschool$project_types), 1, 0)
c.survey.byschool$project_construction=ifelse(grepl("Construction", c.survey.byschool$project_types), 1, 0)


c.survey.byschool$children_attend_school=lc_childrenschools[match(c.survey.byschool$school_id, lc_childrenschools$school_id),"children_attent_school"]
mp.survey.byschool$children_attend_school=mp_childrenschools[match(mp.survey.byschool$school_id, mp_childrenschools$school_id),"children_attend_school"]


c.survey.byschool$family_children_attend_school=lc_childrenschools[match(c.survey.byschool$school_id, lc_childrenschools$school_id),"family_children_attend_school"]
mp.survey.byschool$family_children_attend_school=mp_childrenschools[match(mp.survey.byschool$school_id, mp_childrenschools$school_id),"family_children_attend_school"]

#rename
mp.survey=mp.survey.byschool
c.survey=c.survey.byschool




#This shoudn't do anything. It was included to correct a merge error which we later fixed.  
#keeping this block in just as a merge validation check
missingschool=mpschools[mpschools$school_id=="a94b50a45176a258d28963729674afd6c8b3a550",]
mp.survey[mp.survey$school_id=="a94b50a45176a258d28963729674afd6c8b3a550","school_name"]=missingschool[missingschool$school_id==4013,"school_name"]


commonNames <- names(mp.survey)[which(colnames(mp.survey) %in% colnames(missingschool))]
commonNames <- commonNames[commonNames != "school_id"]
dfmerge<- merge(mp.survey,missingschool,by="school_id",all=T, suffixes=c("",".update"))
for(i in commonNames){
  left <- paste(i, "", sep="")
  right <- paste(i, ".update", sep="")
  dfmerge[is.na(dfmerge[left]) | dfmerge[left]=="",left] <- dfmerge[is.na(dfmerge[left]) | dfmerge[left]=="",right]
  dfmerge[right]<- NULL
  colnames(dfmerge)[colnames(dfmerge) == left] <- i
}
mp.survey=dfmerge
dfmerge=NULL

#this does a quick check of the merge to make sure that all the values match with what they should
for(id in mpschools$school_id)
{
  for(i in commonNames){
    test=mp.survey[mp.survey$school_id==id, i] == mpschools[mpschools$school_id==id, i]
    
    if(!is.na(test)){
      if(!test){
        test2=as.numeric(mp.survey[mp.survey$school_id==id, i]) - mpschools[mpschools$school_id==id, i]<0.0001
        if(!test2){  
          cat(paste(id, i, "\n"))
        }

        }
    }
  }
}


c.survey$councilorselected=ifelse(c.survey$councilorselected=="TRUE", 1, c.survey$councilorselected)
c.survey$councilorselected=ifelse(c.survey$councilorselected=="FALSE", 0, c.survey$councilorselected)
c.survey$councilorselected=as.integer(c.survey$councilorselected)

mp.survey$mpselected=ifelse(mp.survey$mpselected=="TRUE", 1, mp.survey$mpselected)
mp.survey$mpselected=ifelse(mp.survey$mpselected=="FALSE", 0, mp.survey$mpselected)
mp.survey$mpselected=as.integer(mp.survey$mpselected)

mp.survey$constituencyid=ifelse(is.na(mp.survey$constituencyid),(mp.survey$constituency_id),mp.survey$constituencyid)


c.survey$map_id_temp=paste(c.survey$ps_ward_id, ".", c.survey$map_order_id, sep="")
c.survey$map_id=as.integer(factor(c.survey$map_id_temp))
c.survey$map_id_temp=NULL

mp.survey$map_id_temp=paste(mp.survey$constituency_id, ".", mp.survey$map_order_id, sep="")
mp.survey$map_id=as.integer(factor(mp.survey$map_id_temp))
mp.survey$map_id_temp=NULL

c.survey$district_id=as.integer(factor(c.survey$ps_district))
mp.survey$district_id=as.integer(factor(mp.survey$ps_district))

mp.survey$constituencyid=(mp.survey$constituencyid)
sum(is.na(mp.survey$constituencyid))



c.survey$constituencyid=(c.survey$constituencyid)
sum(is.na(c.survey$constituencyid))
c.survey[is.na(c.survey$constituencyid),"constituencyid"]=mp.survey[match(toupper(trimws(c.survey[is.na(c.survey$constituencyid),]$map_constituency)), toupper(trimws(mp.survey$map_constituency))), "constituencyid"]
sum(is.na(c.survey$constituencyid))

c.survey$ps_ward_id=(c.survey$ps_ward_id)
mp.survey$ps_ward_id=(mp.survey$ps_ward_id)

c.survey=c.survey[order(c.survey$map_id),]
c.survey=c.survey[order(c.survey$map_id),]


c.survey.all=c.survey
#exclude cases that were not enumerated
c.survey=c.survey[!is.na(c.survey$instanceID),]

mp.survey.all=mp.survey
#exclude cases that were not enumerated
mp.survey=mp.survey[!is.na(mp.survey$instanceID),]

#make sure there is a selected school in each map
t=aggregate(c.survey$councilorselected, by=list(c.survey$map_id), FUN=sum)
xtabs(~t$x) #should equal 1

#make sure there is a selected school in each map
t=aggregate(mp.survey$mpselected, by=list(mp.survey$map_id), FUN=sum)
xtabs(~t$x) #should equal 1

#some tests to make sure all maps are complete
c.survey$countme=1
t=aggregate(c.survey$countme, by=list(c.survey$map_id), FUN=sum)
xtabs(~t$x) #should equal 3

mp.survey$countme=1
t=aggregate(mp.survey$countme, by=list(mp.survey$map_id), FUN=sum)
xtabs(~t$x) #should equal 3


#make sure there is a selected school in each map
t=aggregate(c.survey$councilorselected, by=list(c.survey$map_id), FUN=sum)
xtabs(~t$x) 

#these are maps for which the councillor did not provide a response (don't know refused or display issue)
#checked these using the Schools for Lottery files
if(nrow(c.survey[c.survey$map_id %in% t[t$x %in% c(0),]$Group.1,])>0)
{
   c.survey[c.survey$map_id %in% t[t$x %in% c(0),]$Group.1,]$councilorselected=NA
}

#make sure there is a selected school in each map
t=aggregate(mp.survey$mpselected, by=list(mp.survey$map_id), FUN=sum)
xtabs(~t$x) #should equal 1 (see below)

#these are maps for which the councillor did not provide a response (don't know refused or display issue)
#checked these using the Schools for Lottery files
mp.survey[mp.survey$map_id %in% t[t$x %in% c(0),]$Group.1,]$mpselected=NA

#should show a treatment for every map
xtabs(~mp.survey$treatment)
xtabs(~c.survey$treatment)

#without attritted
write.csv(c.survey, "./output/c_clean.csv")
write.csv(mp.survey, "./output/mp_clean.csv")

#with attritted
write.csv(c.survey.all, "./output/c_all.csv")
write.csv(mp.survey.all, "./output/mp_all.csv")

